{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import sys\n",
    "sys.path.append(\"../\")\n",
    "from matplotlib import pylab as plt\n",
    "import numpy as np\n",
    "from glob import glob\n",
    "import json\n",
    "from utils import *\n",
    "from tqdm import tqdm\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!CUDA_VISIBLE_DEVICE="
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "codes = [json.loads(s)['code'] for s in open(\"/home/shido/summarization_java/test.json\").readlines()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trn_data = read_pickle(\"../dataset/nl/train.pkl\")\n",
    "vld_data = read_pickle(\"../dataset/nl/valid.pkl\")\n",
    "tst_data = read_pickle(\"../dataset/nl/test.pkl\")\n",
    "code_i2w = read_pickle(\"../dataset/code_i2w.pkl\")\n",
    "code_w2i = read_pickle(\"../dataset/code_w2i.pkl\")\n",
    "nl_i2w = read_pickle(\"../dataset/nl_i2w.pkl\")\n",
    "nl_w2i = read_pickle(\"../dataset/nl_w2i.pkl\")\n",
    "\n",
    "trn_x, trn_y_raw = zip(*trn_data.items())\n",
    "vld_x, vld_y_raw = zip(*vld_data.items())\n",
    "tst_x, tst_y_raw = zip(*tst_data.items())\n",
    "\n",
    "trn_y = [[nl_w2i[t] if t in nl_w2i.keys() else nl_w2i[\"<UNK>\"] for t in l] for l in trn_y_raw]\n",
    "vld_y = [[nl_w2i[t] if t in nl_w2i.keys() else nl_w2i[\"<UNK>\"] for t in l] for l in vld_y_raw]\n",
    "tst_y = [[nl_w2i[t] if t in nl_w2i.keys() else nl_w2i[\"<UNK>\"] for t in l] for l in tst_y_raw]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted([len(x) for x in trn_y])[::-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(len(trn_y))\n",
    "print(len(vld_y))\n",
    "print(len(tst_y))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lengthes = [len(traverse_label(read_pickle(x))) for x in tst_x]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = sorted(glob(\"../models/*/history.json\"))\n",
    "dirs = [x.split(\"/\")[-2] for x in files]\n",
    "histories = {name: json.load(open(x)) for name, x in zip(dirs, files)}\n",
    "dirs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = [\n",
    "    \"multiway_dim256_embed256_drop0.5_lr0.001_batch128_epochs30_layer1\",\n",
    "    \"childsum_dim256_embed256_drop0.5_lr0.001_batch128_epochs30_layer1\",\n",
    "    \"deepcom_dim256_embed256_drop0.5_lr0.001_batch64_epochs30_layer1\",\n",
    "    \"codenn_dim256_embed256_drop0.5_lr0.001_batch128_epochs30_layer1\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(15, 10))\n",
    "# for name, his in [(view, histories[name]) for view, name in zip([\"Ours\", \"Child-Sum\", \"[Hu+, 18]\", \"[Iyer+, 16]\"], names)]:\n",
    "for name, his in histories.items():\n",
    "    if \"drop0.5_lr0.001_batch160_epochs50_layer1\" in name:\n",
    "        plt.plot(his[\"bleu_val\"], \"-\", label=name)\n",
    "#         plt.plot(his[\"loss_val\"], \"-x\", label=name + \"_valid\")\n",
    "plt.grid()\n",
    "plt.legend()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for name, his in histories.items():\n",
    "    if \"drop0.5_lr0.001_batch160_epochs50_layer1\" in name:\n",
    "        print(name, \":\", np.mean(his[\"bleus\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.translate.gleu_score import sentence_gleu\n",
    "from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction\n",
    "def get_gleu(true, pred):\n",
    "    return(sentence_gleu([true], pred))\n",
    "def get_bleu(true, pred):\n",
    "    return(sentence_bleu([true], pred, smoothing_function=SmoothingFunction().method4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# BLEU\n",
    "for name, his in histories.items():\n",
    "    if \"drop0.5_lr0.001_batch160_epochs50_layer1\" in name:\n",
    "        trues = histories[name][\"trues\"]\n",
    "        preds = histories[name][\"preds\"]\n",
    "        gleu = np.mean([get_bleu(x, y) for x, y in zip(trues, preds)])\n",
    "        print(name, gleu)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# GLEU\n",
    "for name, his in histories.items():\n",
    "    if \"drop0.5_lr0.001_batch160_epochs50_layer1\" in name:\n",
    "        trues = histories[name][\"trues\"]\n",
    "        preds = histories[name][\"preds\"]\n",
    "        gleu = np.mean([get_gleu(x, y) for x, y in zip(trues, preds)])\n",
    "        print(name, gleu)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "codes = [codes[int(i)] for i in histories[names[0]][\"numbers\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_data = [[\" \".join(x) for x in histories[name][\"preds\"]] for name in names] + [[\" \".join(x) for x in histories[names[0]][\"trues\"]]]\n",
    "df_data += [histories[name][\"bleus\"] for name in names] + [codes]\n",
    "df_index = [\"PREDICTION \" + name for name in names] + [\"GROUND TRUTH\"]\n",
    "df_index += [\"BLEU-4 \" + name for name in names] + [\"SOURCE CODE\"]\n",
    "df = pd.DataFrame(data=df_data, index=df_index).T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv(\"for_hitachi_lab.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.get([\"SOURCE CODE\", \"GROUND TRUTH\"]).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "shido = histories[\"multiway_dim256_embed256_drop0.5_lr0.001_batch128_epochs30_layer1\"][\"bleus\"]\n",
    "np.mean(shido)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "child = histories[\"childsum_dim256_embed256_drop0.5_lr0.001_batch128_epochs30_layer1\"][\"bleus\"]\n",
    "np.mean(child)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dif = np.array(shido) - np.array(child)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "index = np.argsort(dif)[::-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trues = histories[\"childsumlstm_1layer\"][\"trues\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "childsum = histories[\"childsum_dim256_embed256_drop0.5_lr0.001_batch128_epochs30_layer1\"][\"preds\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ours = histories[\"multiway_dim256_embed256_drop0.5_lr0.001_batch128_epochs30_layer1\"][\"preds\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "number = histories[\"multiway_dim256_embed256_drop0.5_lr0.001_batch128_epochs30_layer1\"][\"numbers\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in index[:500]:\n",
    "    if \" \".join(trues[i]) != \" \".join(ours[i]):\n",
    "        print(\"GT:   \", \" \".join(trues[i]))\n",
    "        print(\"CSum: \", \" \".join(childsum[i]))\n",
    "        print(\"Ours: \", \" \".join(ours[i]))\n",
    "        print(\"Codes:\\n\" + codes[i])\n",
    "#         print(\"Num:  \", number[i])\n",
    "        print(\"-\" * 100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trn_data = read_pickle(\"../dataset/nl/train.pkl\")\n",
    "vld_data = read_pickle(\"../dataset/nl/valid.pkl\")\n",
    "tst_data = read_pickle(\"../dataset/nl/test.pkl\")\n",
    "code_i2w = read_pickle(\"../dataset/code_i2w.pkl\")\n",
    "code_w2i = read_pickle(\"../dataset/code_w2i.pkl\")\n",
    "nl_i2w = read_pickle(\"../dataset/nl_i2w.pkl\")\n",
    "nl_w2i = read_pickle(\"../dataset/nl_w2i.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trn_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "xx = [traverse_label(read_pickle(t)) for t in tst_x]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x = [\" \".join([str(code_w2i[w]) for w in t]) + \"\\n\" for t in xx]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = [\" \".join([str(w) for w in t[1:-1]]) + \"\\n\" for t in tst_y]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "xy = [xw + \"\\t\" + yw for xw, yw in zip(x, y)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "open(\"x.tst\", \"w\").writelines(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(x) / 32"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = open(\"x.tst\", \"r\").read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "a.split(\"\\n\")[-2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
